其他
两种方法批量做TCGA生存分析
大神一句话,菜鸟跑半年。我不是大神,但我可以缩短你走弯路的半年~
就像歌儿唱的那样,如果你不知道该往哪儿走,就留在这学点生信好不好~
这里有豆豆和花花的学习历程,从新手到进阶,生信路上有你有我!
花花写于2020-01-07
本系列是我的TCGA
学习记录,跟着生信技能树B站课程学的,已获得授权(嗯,真的^_^)。课程链接:https://www.bilibili.com/video/av49363776
目录:
1.准备R包
1if(!require(survival))install.packages("survival")
2if(!require(survminer))install.packages("survminer")
3if(!require(stringr))install.packages("stringr")
2.输入数据
需要表达矩阵expr和临床信息meta。
(变量名无实际意义,但不要轻易修改,改了前面就要改后面,流程很长,改起来麻烦)
1rm(list=ls())
2options(stringsAsFactors = F)
3Rdata_dir='./Rdata/'
4Figure_dir='./figures/'
5# 加载上一步从RTCGA.miRNASeq包里面提取miRNA表达矩阵和对应的样本临床信息。
6load( file =
7 file.path(Rdata_dir,'TCGA-KIRC-miRNA-example.Rdata')
8)
9dim(expr)
10#> [1] 552 593
11dim(meta)
12#> [1] 537 8
13group_list=ifelse(as.numeric(substr(colnames(expr),14,15)) < 10,'tumor','normal')
14table(group_list)
15#> group_list
16#> normal tumor
17#> 71 522
3.整理表达矩阵和临床信息
这里做生存分析,已经不需要正常样本的表达矩阵了,所以需要过滤。临床信息需要进行整理。
1exprSet=na.omit(expr)
2exprSet=exprSet[,group_list=='tumor']
3dim(exprSet)
4#> [1] 552 522
5str(meta)
6#> 'data.frame': 537 obs. of 8 variables:
7#> $ patient.bcr_patient_barcode : chr "tcga-3z-a93z" "tcga-6d-aa2e" "tcga-a3-3306" "tcga-a3-3307" ...
8#> $ patient.vital_status : chr "alive" "alive" "alive" "alive" ...
9#> $ patient.days_to_death : chr NA NA NA NA ...
10#> $ patient.days_to_last_followup : chr "4" "135" "1120" "1436" ...
11#> $ patient.race : chr "black or african american" "black or african american" "white" NA ...
12#> $ patient.age_at_initial_pathologic_diagnosis: chr "69" "68" "67" "66" ...
13#> $ patient.gender : chr "male" "female" "male" "male" ...
14#> $ patient.stage_event.pathologic_stage : chr "stage i" "stage i" "stage i" "stage iii" ...
15colnames(meta)=c('ID','event','death','last_followup','race','age','gender','stage')
16#调整ID和表达矩阵内容和顺序都一样
17head(meta$ID)
18#> [1] "tcga-3z-a93z" "tcga-6d-aa2e" "tcga-a3-3306" "tcga-a3-3307" "tcga-a3-3308"
19#> [6] "tcga-a3-3311"
20meta$ID=str_to_upper(meta$ID)
21meta=meta[match(substr(colnames(exprSet),1,12),meta$ID),]
22head(meta$ID)
23#> [1] "TCGA-A3-3307" "TCGA-A3-3308" "TCGA-A3-3311" "TCGA-A3-3313" "TCGA-A3-3316"
24#> [6] "TCGA-A3-3317"
25head(colnames(exprSet))
26#> [1] "TCGA-A3-3307-01A-01T-0860-13" "TCGA-A3-3308-01A-02R-1324-13"
27#> [3] "TCGA-A3-3311-01A-02R-1324-13" "TCGA-A3-3313-01A-02R-1324-13"
28#> [5] "TCGA-A3-3316-01A-01T-0860-13" "TCGA-A3-3317-01A-01T-0860-13"
4.整理生存分析输入数据
1#1.1由随访时间和死亡时间计算生存时间
2meta[,3][is.na(meta[,3])]=0
3meta[,4][is.na(meta[,4])]=0
4meta$days=as.numeric(meta[,3])+as.numeric(meta[,4])
5#时间以月份记
6meta$time=meta$days/30
7#1.2 根据生死定义event,活着是0,死的是1
8meta$event=ifelse(meta$event=='alive',0,1)
9table(meta$event)
10#>
11#> 0 1
12#> 364 158
13#1.3 年龄和年龄分组
14meta$age=as.numeric(meta$age)
15meta$age_group=ifelse(meta$age>median(meta$age),'older','younger')
16table(meta$age_group)
17#>
18#> older younger
19#> 244 278
20#1.4 stage
21library(stringr)
22meta$stage=str_split(meta$stage,' ',simplify = T)[,2]
23table(meta$stage)
24#>
25#> i ii iii iv
26#> 258 57 124 83
27#1.5 race
28table(meta$race)
29#>
30#> asian black or african american white
31#> 8 58 448
5.生存分析绘图
5.1 简单绘图(性别)
1library(survival)
2library(survminer)
3sfit <- survfit(Surv(time, event)~gender, data=meta)
4ggsurvplot(sfit, conf.int=F, pval=TRUE)
1ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
2 risk.table =TRUE,pval =TRUE,
3 conf.int =TRUE,xlab ="Time in months",
4 ggtheme =theme_light(),
5 ncensor.plot = TRUE)
5.2 多个生存分析拼图(性别和年龄)
1sfit1=survfit(Surv(time, event)~gender, data=meta)
2sfit2=survfit(Surv(time, event)~age_group, data=meta)
3splots <- list()
4splots[[1]] <- ggsurvplot(sfit1,pval =TRUE, data = meta, risk.table = TRUE)
5splots[[2]] <- ggsurvplot(sfit2,pval =TRUE, data = meta, risk.table = TRUE)
6arrange_ggsurvplots(splots, print = TRUE, ncol = 2, nrow = 1, risk.table.height = 0.4)
1dev.off()
2#> RStudioGD
3#> 2
可以很明显看到,肿瘤病人的生存受着诊断癌症的年龄的影响,却与性别无关。
5.3 挑选感兴趣的(多个)基因做生存分析
来自于文章:2015-TCGA-ccRCC-5-miRNAs-signatures
Integrated genomic analysis identifies subclasses and prognosis signatures of kidney cancer
miR-21,miR-143,miR-10b,miR-192,miR-183
1gs=c('hsa-mir-21','hsa-mir-143','hsa-mir-192',
2 'hsa-mir-183')
3splots <- lapply(gs, function(g){
4 meta$gene=ifelse(exprSet[g,]>median(exprSet[g,]),'high','low')
5 sfit1=survfit(Surv(time, event)~gene, data=meta)
6 ggsurvplot(sfit1,pval =TRUE, data = meta, risk.table = TRUE)
7})
8arrange_ggsurvplots(splots, print = TRUE,
9 ncol = 2, nrow = 2, risk.table.height = 0.4)
6. 批量生存分析
6.1 log-rank方法
对每个基因(在这个例子里是miRNA)求了p值。
1mySurv=with(meta,Surv(time, event))
2log_rank_p <- apply(exprSet , 1 , function(gene){
3 # gene=exprSet[1,]
4 meta$group=ifelse(gene>median(gene),'high','low')
5 data.survdiff=survdiff(mySurv~group,data=meta)
6 p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
7 return(p.val)
8})
9log_rank_p=sort(log_rank_p)
10table(log_rank_p<0.01)
11#>
12#> FALSE TRUE
13#> 464 88
14table(log_rank_p<0.05)
15#>
16#> FALSE TRUE
17#> 401 151
结果是88个基因的p<0.01,151个基因的p<0.05。
6.2 cox方法
1cox_results <-apply(exprSet , 1 , function(gene){
2 # gene= exprSet[1,]
3 group=ifelse(gene>median(gene),'high','low')
4 survival_dat <- data.frame(group=group,stage=meta$stage,age=meta$age,
5 gender=meta$gender,
6 stringsAsFactors = F)
7 m=coxph(mySurv ~ gender + age + stage+ group, data = survival_dat)
8
9 beta <- coef(m)
10 se <- sqrt(diag(vcov(m)))
11 HR <- exp(beta)
12 HRse <- HR * se
13
14 #summary(m)
15 tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
16 HR = HR, HRse = HRse,
17 HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
18 HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
19 HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
20 return(tmp['grouplow',])
21
22})
23cox_results=t(cox_results)
24table(cox_results[,4]<0.01)
25#>
26#> FALSE TRUE
27#> 492 60
28table(cox_results[,4]<0.05)
29#>
30#> FALSE TRUE
31#> 428 124
结果是60个基因的p<0.01,124个基因的p<0.05。
7.对生存影响显著的基因可视化(热图和PCA)
将p<0.05的基因表达矩阵筛选出来,作图看看
1library(pheatmap)
2choose_gene_rank=names(log_rank_p[log_rank_p<0.05])
3choose_matrix_rank=expr[choose_gene_rank,]
4source("3-plotfunction.R")
5
6h1 = draw_heatmap(choose_matrix_rank,group_list)
7p1 = draw_pca(log(choose_matrix_rank+1),group_list)
8
9choose_gene_cox=rownames(cox_results[cox_results[,4]<0.05,])
10choose_matrix_cox=expr[choose_gene_cox,]
11
12h2 = draw_heatmap(choose_matrix_cox,group_list)
13p2 = draw_pca(log(choose_matrix_cox+1),group_list)
14library(patchwork)
15(h1 + h2)/(p1 + p2)
这里的聚类分组不在一起也是正常的,因为选的不是差异基因而是对生死影响显著的基因,和是否癌症组织有一定关系,但不是绝对关系。
插个小广告!
再给生信技能树打个call!